# Unpack the report/knit parameters into top-level script variables.
# (Idiom fix: use `<-` rather than `=` for top-level assignment.)
output.var <- params$output.var    # response column name (e.g. "y3")
transform.abs <- FALSE             # absolute-value transform switch (not used in this chunk)
log.pred <- params$log.pred        # TRUE -> model log10(response)
norm.pred <- FALSE                 # TRUE -> model a normalized response (see bestNormalize section)
eda <- params$eda                  # TRUE -> run the exploratory-analysis chunks
algo.forward.caret <- params$algo.forward.caret    # enable the forward-selection run
algo.backward.caret <- params$algo.backward.caret  # enable the backward-selection run
algo.stepwise.caret <- params$algo.stepwise.caret  # enable the stepwise-selection run
algo.LASSO.caret <- params$algo.LASSO.caret        # enable the LASSO (glmnet) run
algo.LARS.caret <- params$algo.LARS.caret          # enable the LARS run
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
## $ output.var : chr "y3"
## $ log.pred : logi TRUE
## $ eda : logi TRUE
## $ algo.forward.caret : logi FALSE
## $ algo.backward.caret: logi FALSE
## $ algo.stepwise.caret: logi FALSE
## $ algo.LASSO.caret : logi FALSE
## $ algo.LARS.caret : logi FALSE
# Setup Labels
# Name of the column actually modeled: '<output.var>.log' when log.pred is on.
# FIX: the original read `... else output.var.tr = output.var`, embedding an
# assignment in the else branch, which is a parse error in R. Assign the value
# of the whole if-expression instead.
output.var.tr <- if (isTRUE(log.pred)) paste0(output.var, '.log') else output.var
# Load features and labels, join on JobName, and keep complete cases only.
feat = read.csv('../../Data/features_highprec.csv')
labels = read.csv('../../Data/labels.csv')
# Candidate predictors = every feature column except the join key.
predictors = names(dplyr::select(feat,-JobName))
data.ori = inner_join(feat,labels,by='JobName')
#data.ori = inner_join(feat,select_at(labels,c('JobName',output.var)),by='JobName')
# Split off rows with any missing value; model only the complete cases.
cc = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
data = data.ori[cc,] %>% select_at(c(predictors,output.var,'JobName'))
message('Original cases: ',nrow(data.ori))
## Original cases: 10000
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 3020
message('Complete cases: ',nrow(data))
## Complete cases: 6980
## JobName y3
## Job_00001: 1 Min. : 95.91
## Job_00002: 1 1st Qu.:118.29
## Job_00003: 1 Median :124.03
## Job_00004: 1 Mean :125.40
## Job_00007: 1 3rd Qu.:131.06
## Job_00008: 1 Max. :193.73
## (Other) :6974
The output variable y3 shows right skewness, so we will proceed with a log transformation.
# Distribution of the raw output variable: histogram + density, then a Q-Q plot.
df=gather(select_at(data,output.var))
ggplot(df, aes(x=value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density()
#stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
ggplot(gather(select_at(data,output.var)), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
# Apply the log10 transform to the response when requested, storing it under
# output.var.tr; otherwise copy the response unchanged.
if(log.pred==TRUE) data[[output.var.tr]] = log(data[[output.var]],10) else
data[[output.var.tr]] = data[[output.var]]
# Compare raw vs transformed response: histograms/densities and Q-Q plots.
df=gather(select_at(data,c(output.var,output.var.tr)))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=2)
ggplot(gather(select_at(data,c(output.var,output.var.tr))), aes(sample=value)) +
stat_qq() +
facet_wrap(~key, scales = 'free',ncol=4)
Normalization of y3 using the bestNormalize package (orderNorm is suggested). This is interesting, but it goes beyond the objective of this project.
# Fit bestNormalize across its candidate transformations (kept for reference;
# the chosen orderNorm transform is NOT applied to the modeling data).
t=bestNormalize::bestNormalize(data[[output.var]])
t
## Best Normalizing transformation with 6980 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - No transform: 2.9215
## - Box-Cox: 1.4033
## - Log_b(x+a): 2.0169
## - sqrt(x+a): 2.4599
## - exp(x): 749.6365
## - arcsinh(x): 2.0169
## - Yeo-Johnson: 1.1908
## - orderNorm: 1.108
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6980 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 95.913 118.289 124.030 131.059 193.726
# Q-Q plots: raw response vs the bestNormalize(orderNorm)-transformed response.
qqnorm(data[[output.var]])
qqnorm(predict(t))
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a normal distribution.
# Feature engineering: derived predictors from the controlled x-variables.
# FIX (maintainability): replaces 41 copy-pasted assignments with data-driven
# loops. Column creation order is preserved exactly (ratios, then logs in the
# original order, then inverse squares) so the downstream grep()-based
# predictor ordering is unchanged.
# Pairwise ratios between related controlled variables.
ratio.pairs <- list(c('x2','x1'), c('x6','x5'), c('x9','x7'), c('x10','x8'),
                    c('x14','x12'), c('x15','x13'), c('x17','x16'),
                    c('x19','x18'), c('x21','x20'), c('x23','x22'))
for (p in ratio.pairs) {
  data[[paste0(p[1], 'by', p[2])]] <- data[[p[1]]] / data[[p[2]]]
}
# Natural-log transforms (order kept identical to the original assignments,
# including the trailing x11).
log.cols <- c('x1','x2','x5','x6','x7','x9','x8','x10','x12','x14','x13',
              'x15','x16','x17','x18','x19','x20','x21','x22','x23','x11')
for (v in log.cols) {
  data[[paste0(v, 'log')]] <- log(data[[v]])
}
# Inverse-square transforms.
sqinv.cols <- c('x1','x5','x7','x8','x12','x13','x16','x18','x20','x22')
for (v in sqinv.cols) {
  data[[paste0(v, 'sqinv')]] <- 1 / (data[[v]])^2
}
# Predictor names prior to feature engineering:
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10" "x11"
## [12] "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20" "x21" "x22"
## [23] "x23" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6" "stat7" "stat8" "stat9" "stat10"
## [34] "stat11" "stat12" "stat13" "stat14" "stat15" "stat16" "stat17" "stat18" "stat19" "stat20" "stat21"
## [45] "stat22" "stat23" "stat24" "stat25" "stat26" "stat27" "stat28" "stat29" "stat30" "stat31" "stat32"
## [56] "stat33" "stat34" "stat35" "stat36" "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43"
## [67] "stat44" "stat45" "stat46" "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54"
## [78] "stat55" "stat56" "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65"
## [89] "stat66" "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [100] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86" "stat87"
## [111] "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96" "stat97" "stat98"
## [122] "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106" "stat107" "stat108" "stat109"
## [133] "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116" "stat117" "stat118" "stat119" "stat120"
## [144] "stat121" "stat122" "stat123" "stat124" "stat125" "stat126" "stat127" "stat128" "stat129" "stat130" "stat131"
## [155] "stat132" "stat133" "stat134" "stat135" "stat136" "stat137" "stat138" "stat139" "stat140" "stat141" "stat142"
## [166] "stat143" "stat144" "stat145" "stat146" "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153"
## [177] "stat154" "stat155" "stat156" "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164"
## [188] "stat165" "stat166" "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175"
## [199] "stat176" "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [210] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196" "stat197"
## [221] "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206" "stat207" "stat208"
## [232] "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216" "stat217"
# Rebuild the predictor list so it includes the engineered columns:
# controlled x-variables (plus their ratio/log/inverse-square derivatives)
# first, then the stat variables.
controlled.vars = colnames(data)[grep("^x",colnames(data))]
stat.vars = colnames(data)[grep("^stat",colnames(data))]
predictors = c(controlled.vars,stat.vars)
predictors
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
## [11] "x11" "x12" "x13" "x14" "x15" "x16" "x17" "x18" "x19" "x20"
## [21] "x21" "x22" "x23" "x2byx1" "x6byx5" "x9byx7" "x10byx8" "x14byx12" "x15byx13" "x17byx16"
## [31] "x19byx18" "x21byx20" "x23byx22" "x1log" "x2log" "x5log" "x6log" "x7log" "x9log" "x8log"
## [41] "x10log" "x12log" "x14log" "x13log" "x15log" "x16log" "x17log" "x18log" "x19log" "x20log"
## [51] "x21log" "x22log" "x23log" "x11log" "x1sqinv" "x5sqinv" "x7sqinv" "x8sqinv" "x12sqinv" "x13sqinv"
## [61] "x16sqinv" "x18sqinv" "x20sqinv" "x22sqinv" "stat1" "stat2" "stat3" "stat4" "stat5" "stat6"
## [71] "stat7" "stat8" "stat9" "stat10" "stat11" "stat12" "stat13" "stat14" "stat15" "stat16"
## [81] "stat17" "stat18" "stat19" "stat20" "stat21" "stat22" "stat23" "stat24" "stat25" "stat26"
## [91] "stat27" "stat28" "stat29" "stat30" "stat31" "stat32" "stat33" "stat34" "stat35" "stat36"
## [101] "stat37" "stat38" "stat39" "stat40" "stat41" "stat42" "stat43" "stat44" "stat45" "stat46"
## [111] "stat47" "stat48" "stat49" "stat50" "stat51" "stat52" "stat53" "stat54" "stat55" "stat56"
## [121] "stat57" "stat58" "stat59" "stat60" "stat61" "stat62" "stat63" "stat64" "stat65" "stat66"
## [131] "stat67" "stat68" "stat69" "stat70" "stat71" "stat72" "stat73" "stat74" "stat75" "stat76"
## [141] "stat77" "stat78" "stat79" "stat80" "stat81" "stat82" "stat83" "stat84" "stat85" "stat86"
## [151] "stat87" "stat88" "stat89" "stat90" "stat91" "stat92" "stat93" "stat94" "stat95" "stat96"
## [161] "stat97" "stat98" "stat99" "stat100" "stat101" "stat102" "stat103" "stat104" "stat105" "stat106"
## [171] "stat107" "stat108" "stat109" "stat110" "stat111" "stat112" "stat113" "stat114" "stat115" "stat116"
## [181] "stat117" "stat118" "stat119" "stat120" "stat121" "stat122" "stat123" "stat124" "stat125" "stat126"
## [191] "stat127" "stat128" "stat129" "stat130" "stat131" "stat132" "stat133" "stat134" "stat135" "stat136"
## [201] "stat137" "stat138" "stat139" "stat140" "stat141" "stat142" "stat143" "stat144" "stat145" "stat146"
## [211] "stat147" "stat148" "stat149" "stat150" "stat151" "stat152" "stat153" "stat154" "stat155" "stat156"
## [221] "stat157" "stat158" "stat159" "stat160" "stat161" "stat162" "stat163" "stat164" "stat165" "stat166"
## [231] "stat167" "stat168" "stat169" "stat170" "stat171" "stat172" "stat173" "stat174" "stat175" "stat176"
## [241] "stat177" "stat178" "stat179" "stat180" "stat181" "stat182" "stat183" "stat184" "stat185" "stat186"
## [251] "stat187" "stat188" "stat189" "stat190" "stat191" "stat192" "stat193" "stat194" "stat195" "stat196"
## [261] "stat197" "stat198" "stat199" "stat200" "stat201" "stat202" "stat203" "stat204" "stat205" "stat206"
## [271] "stat207" "stat208" "stat209" "stat210" "stat211" "stat212" "stat213" "stat214" "stat215" "stat216"
## [281] "stat217"
All predictors show a fat-tailed situation, where the two tails are very tall with a low density around the mean. The orderNorm transformation could help here (see the [Best Normalizator] section).
Histograms
# EDA: histograms/densities and summaries for a hand-picked sample of predictors.
if (eda == TRUE){
cols = c('x11','x18','stat98','x7','stat110')
df=gather(select_at(data,cols))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=3)
# ggplot(gather(select_at(data,cols)), aes(sample=value)) +
# stat_qq()+
# facet_wrap(~key, scales = 'free',ncol=2)
lapply(select_at(data,cols),summary)
}
## $x11
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.000e-08 9.494e-08 1.001e-07 1.001e-07 1.052e-07 1.100e-07
##
## $x18
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.500 3.147 4.769 4.772 6.418 7.999
##
## $stat98
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.998619 -1.551882 -0.015993 -0.005946 1.528405 2.999499
##
## $x7
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.700 1.266 1.854 1.852 2.446 3.000
##
## $stat110
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.999543 -1.496865 -0.002193 -0.004129 1.504273 2.999563
Scatter plot vs. output variable **y3.log**
# EDA: scatter of each sampled predictor vs the transformed response,
# with a GAM smoother. (`cols` comes from the histogram chunk above.)
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light green',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=3)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
All of these predictors show strong indications of fat tails.
# EDA: histograms/densities for ALL predictors (large facet grid).
if (eda == TRUE){
df=gather(select_at(data,predictors))
ggplot(df, aes(value)) +
geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
geom_density() +
# stat_function(fun = dnorm, n = 100, args = list(mean = mean(df$value), sd = sd(df$value)))
facet_wrap(~key, scales = 'free',ncol=4)
}
# Correlation of each predictor with the transformed response, reported as
# the top-20 positive and top-20 negative correlations.
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
# https://stackoverflow.com/questions/27034655/how-to-use-dplyrarrangedesc-when-using-a-string-as-column-name
t=as.data.frame(round(cor(dplyr::select(data,-one_of(output.var.tr,'JobName'))
,select_at(data,output.var.tr)),4)) %>%
#rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-y3.log)
rownames_to_column(var='variable') %>% filter(variable != !!output.var) %>% arrange(-!!sym(output.var.tr))
#DT::datatable(t)
message("Top Positive")
#kable(head(arrange(t,desc(y3.log)),20))
kable(head(arrange(t,desc(!!sym(output.var.tr))),20))
message("Top Negative")
#kable(head(arrange(t,y3.log),20))
kable(head(arrange(t,!!sym(output.var.tr)),20))
}
## Top Positive
## Top Negative
| variable | y3.log |
|---|---|
| x18sqinv | -0.3602 |
| x19byx18 | -0.2422 |
| x7sqinv | -0.2093 |
| stat110 | -0.1594 |
| x4 | -0.0603 |
| x16sqinv | -0.0528 |
| x9byx7 | -0.0414 |
| stat13 | -0.0345 |
| stat41 | -0.0345 |
| stat14 | -0.0317 |
| stat149 | -0.0309 |
| stat113 | -0.0279 |
| stat4 | -0.0248 |
| stat106 | -0.0236 |
| stat146 | -0.0236 |
| x8sqinv | -0.0224 |
| stat186 | -0.0217 |
| stat91 | -0.0210 |
| stat214 | -0.0209 |
| stat5 | -0.0207 |
# EDA: full predictor correlation matrix (only the first 10x10 block shown).
if (eda == TRUE){
#chart.Correlation(select(data,-JobName), pch=21)
t=as.data.frame(round(cor(dplyr::select(data,-one_of('JobName'))),4))
#DT::datatable(t,options=list(scrollX=T))
message("Showing only 10 variables")
kable(t[1:10,1:10])
}
## Showing only 10 variables
| x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | |
|---|---|---|---|---|---|---|---|---|---|---|
| x1 | 1.0000 | 0.0034 | -0.0028 | 0.0085 | 0.0068 | 0.0159 | 0.0264 | -0.0012 | 0.0142 | 0.0013 |
| x2 | 0.0034 | 1.0000 | -0.0057 | 0.0004 | -0.0094 | -0.0101 | 0.0089 | 0.0078 | 0.0049 | -0.0214 |
| x3 | -0.0028 | -0.0057 | 1.0000 | 0.0029 | 0.0046 | 0.0006 | -0.0105 | -0.0002 | 0.0167 | -0.0137 |
| x4 | 0.0085 | 0.0004 | 0.0029 | 1.0000 | -0.0059 | 0.0104 | 0.0098 | 0.0053 | 0.0061 | -0.0023 |
| x5 | 0.0068 | -0.0094 | 0.0046 | -0.0059 | 1.0000 | 0.0016 | -0.0027 | 0.0081 | 0.0259 | -0.0081 |
| x6 | 0.0159 | -0.0101 | 0.0006 | 0.0104 | 0.0016 | 1.0000 | 0.0200 | -0.0157 | 0.0117 | -0.0072 |
| x7 | 0.0264 | 0.0089 | -0.0105 | 0.0098 | -0.0027 | 0.0200 | 1.0000 | -0.0018 | -0.0069 | -0.0221 |
| x8 | -0.0012 | 0.0078 | -0.0002 | 0.0053 | 0.0081 | -0.0157 | -0.0018 | 1.0000 | 0.0142 | -0.0004 |
| x9 | 0.0142 | 0.0049 | 0.0167 | 0.0061 | 0.0259 | 0.0117 | -0.0069 | 0.0142 | 1.0000 | 0.0149 |
| x10 | 0.0013 | -0.0214 | -0.0137 | -0.0023 | -0.0081 | -0.0072 | -0.0221 | -0.0004 | 0.0149 | 1.0000 |
Scatter plots with all predictors and the output variable (y3.log)
# EDA: scatter of every predictor vs the transformed response (GAM smoother).
if (eda == TRUE){
d = gather(dplyr::select_at(data,c(predictors,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
}
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
Multicollinearity check among predictors: the engineered x-transforms are strongly collinear with their source variables (very large VIFs below), while the stat variables show essentially none (VIF near 1).
Showing top predictors by VIF value
# Variance inflation factors for all predictors, largest first.
if (eda == TRUE){
vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
head(vifDF,75)
}
## Variables VIF
## 1 x16log 3898.657885
## 2 x5log 2548.241629
## 3 x20log 1933.938073
## 4 x16 1794.335885
## 5 x11 1617.835412
## 6 x11log 1617.529796
## 7 x8log 1616.618846
## 8 x5 1181.352484
## 9 x20 899.832193
## 10 x8 760.784111
## 11 x7log 551.537736
## 12 x1log 539.431432
## 13 x16sqinv 449.246006
## 14 x13log 379.929844
## 15 x12log 371.951963
## 16 x18log 316.819341
## 17 x5sqinv 296.949745
## 18 x7 267.551246
## 19 x1 259.322495
## 20 x20sqinv 226.688625
## 21 x8sqinv 190.358246
## 22 x13 186.168906
## 23 x12 185.412904
## 24 x22log 176.081153
## 25 x18 158.571673
## 26 x22 89.101209
## 27 x7sqinv 67.107315
## 28 x1sqinv 65.708942
## 29 x21 57.729060
## 30 x2 48.327951
## 31 x13sqinv 46.881321
## 32 x21log 46.673228
## 33 x12sqinv 45.232723
## 34 x2log 43.040667
## 35 x18sqinv 40.950701
## 36 x23 38.225244
## 37 x23log 35.673242
## 38 x6 34.791262
## 39 x17 28.762744
## 40 x21byx20 23.217635
## 41 x22sqinv 23.209779
## 42 x6log 22.007646
## 43 x17byx16 21.586372
## 44 x6byx5 20.627254
## 45 x9 20.143701
## 46 x19 19.326069
## 47 x10 19.180606
## 48 x14 17.547345
## 49 x2byx1 17.031845
## 50 x10byx8 15.083700
## 51 x19log 14.682239
## 52 x9log 14.527478
## 53 x15 13.261916
## 54 x14log 12.704994
## 55 x17log 12.577187
## 56 x23byx22 12.081108
## 57 x9byx7 11.339508
## 58 x19byx18 10.632290
## 59 x14byx12 10.165072
## 60 x15log 9.057056
## 61 x15byx13 9.035617
## 62 x10log 8.850450
## 63 stat186 1.072106
## 64 stat156 1.070999
## 65 stat66 1.070519
## 66 stat32 1.068938
## 67 stat56 1.068647
## 68 stat105 1.067474
## 69 stat141 1.067352
## 70 stat11 1.067043
## 71 stat31 1.066783
## 72 stat52 1.066482
## 73 stat41 1.066290
## 74 stat178 1.066105
## 75 x3 1.065789
# Add a sqrt transform of x18 and inspect it against the response.
data.tr=data %>%
mutate(x18.sqrt = sqrt(x18))
cols=c('x18','x18.sqrt')
# ggplot(gather(select_at(data.tr,cols)), aes(value)) +
# geom_histogram(aes(y=..density..),bins = 50,fill='light blue') +
# geom_density() +
# facet_wrap(~key, scales = 'free',ncol=4)
d = gather(dplyr::select_at(data.tr,c(cols,output.var.tr)),key=target,value=value,-!!output.var.tr)
ggplot(data=d, aes_string(x='value',y=output.var.tr)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#removing unwanted variables
data.tr=data.tr %>%
#dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('x18','y3','JobName')])
dplyr::select_at(names(data.tr)[! names(data.tr) %in% c('JobName')])
data=data.tr
# From here on the modeled response column name lives in label.names.
label.names=output.var.tr
# InteractionMode controls how the PCA design matrix is expanded:
# 0 for no interaction,
# 1 for Full 2 way interaction and
# 2 for Selective 2 way interaction
# 3 for Selective 3 way interaction
InteractionMode = 2
# PCA inputs: every column except the response.
pca.vars = names(data)
pca.vars = pca.vars[!pca.vars %in% label.names]
# http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
# Spin up a parallel backend while the (potentially large) interaction-expanded
# PCA model is built; target ~75% of cores, leaving the rest for other tasks.
# FIX: makeCluster() expects an integer worker count; detectCores()*0.75 is
# fractional, so floor it and keep at least one worker.
cl <- makeCluster(max(1, floor(detectCores() * 0.75)))
registerDoParallel(cl)
if (InteractionMode == 1) {
  # Full two-way interactions among ALL predictors.
  pca.formula <- as.formula(paste0('~(', paste0(pca.vars, collapse = '+'), ')^2'))
  pca.model <- prcomp(formula = pca.formula, data = data[, pca.vars], center = T, scale. = T, retx = T)
  #saveRDS(pca.model,'pca.model.rds')
}
if (InteractionMode == 0) {
  # No interactions: PCA on the raw predictor matrix.
  pca.model <- prcomp(x = data[, pca.vars], center = T, scale. = T, retx = T)
}
# FIX: scalar condition — use && rather than the vectorized &.
if (InteractionMode >= 2 && InteractionMode <= 3) {
  # Selective interactions: only the controlled x-variables interact;
  # the stat variables enter additively.
  controlled.vars <- pca.vars[grep("^x", pca.vars)]
  stat.vars <- pca.vars[grep("^stat", pca.vars)]
  if (InteractionMode >= 2) {
    interaction.form <- paste0('~(', paste0(controlled.vars, collapse = '+'), ')^2')
  }
  if (InteractionMode >= 3) {
    # Overrides the two-way form when mode 3 is selected.
    interaction.form <- paste0('~(', paste0(controlled.vars, collapse = '+'), ')^3')
  }
  no.interact.form <- paste0(stat.vars, collapse = '+')
  pca.formula <- as.formula(paste(interaction.form, no.interact.form, sep = "+"))
  pca.model <- prcomp(formula = pca.formula, data = data[, pca.vars], center = T, scale. = T, retx = T)
}
stopCluster(cl)
registerDoSEQ() # register sequential engine in case you are not using this function anymore
# Keep the leading principal components explaining up to targetCumVar of the
# total variance; variance bookkeeping is stored on the pca.model object.
targetCumVar = .9
pca.model$var = pca.model$sdev ^ 2 #eigenvalues
pca.model$pvar = pca.model$var / sum(pca.model$var)
pca.model$cumpvar = cumsum(pca.model$pvar )
# NOTE(review): '<=' keeps only the PCs whose cumulative variance stays below
# the threshold, so the component that crosses 90% is dropped — confirm intended.
pca.model$pcaSel = pca.model$cumpvar<=targetCumVar
pca.model$pcaSelCount = sum(pca.model$pcaSel)
pca.model$pcaSelTotVar = sum(pca.model$pvar[pca.model$pcaSel])
message(pca.model$pcaSelCount, " PCAs justify ",percent(targetCumVar)," of the total Variance. (",percent(pca.model$pcaSelTotVar),")")
## 164 PCAs justify 90.0% of the total Variance. (90.0%)
# Scree and cumulative-variance plots for the retained components.
plot(pca.model$var,xlab="Principal component", ylab="Proportion of variance explained", type='b')
plot(cumsum(pca.model$pvar ),xlab="Principal component", ylab="Cumulative Proportion of variance explained", ylim=c(0,1), type='b')
screeplot(pca.model,npcs = pca.model$pcaSelCount)
screeplot(pca.model,npcs = pca.model$pcaSelCount,type='lines')
#summary(pca.model)
#pca.model$rotation
#creating dataset: response column + scores of the selected PCs
data.pca = dplyr::select(data,!!label.names) %>%
dplyr::bind_cols(dplyr::select(as.data.frame(pca.model$x)
,!!colnames(pca.model$rotation)[pca.model$pcaSel])
)
data.pca = data.pca[sample(nrow(data.pca)),] # randomly shuffle data
# 80/20 train/test split via caTools::sample.split.
split = sample.split(data.pca[,label.names], SplitRatio = 0.8)
data.train = subset(data.pca, split == TRUE)
data.test = subset(data.pca, split == FALSE)
plot.diagnostics <- function(model, train) {
  # Regression diagnostic plots for a fitted lm-style model:
  #   - base plot(model) panels
  #   - studentized and standardized residuals vs predicted values
  #   - histogram of studentized residuals with a N(0,1) overlay
  #   - leverage (hat values) and Cook's distance index plots
  # Args:
  #   model : fitted model supporting resid/rstandard/rstudent/predict
  #   train : the data the model was fitted on (x-axis predictions)
  # Returns: the vector of Cook's distances.
  plot(model)
  residuals = resid(model) # Plotted above in plot(lm.out)
  r.standard = rstandard(model)
  r.student = rstudent(model)
  # Studentized residuals vs predicted values.
  df = data.frame(x=predict(model,train),y=r.student)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = 0,size=1)+
    ylab("Student Residuals") +
    xlab("Predicted Values")+
    ggtitle("Student Residual Plot")
  plot(p)
  # Standardized residuals vs predicted values with +/-2 reference lines.
  # FIX: this panel plots r.standard, so label it "Standardized" (the original
  # copy-pasted the "Student" labels from the panel above).
  df = data.frame(x=predict(model,train),y=r.standard)
  p=ggplot(data=df,aes(x=x,y=y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_hline(yintercept = c(-2,0,2),size=1)+
    ylab("Standardized Residuals") +
    xlab("Predicted Values")+
    ggtitle("Standardized Residual Plot")
  plot(p)
  # Histogram of studentized residuals vs the standard normal density.
  df=data.frame(r.student)
  p=ggplot(data=df,aes(r.student)) +
    geom_histogram(aes(y=..density..),bins = 50,fill='blue',alpha=0.6) +
    stat_function(fun = dnorm, n = 100, args = list(mean = 0, sd = 1)) +
    ylab("Density")+
    xlab("Studentized Residuals")+
    ggtitle("Distribution of Studentized Residuals")
  plot(p)
  # http://www.stat.columbia.edu/~martin/W2024/R7.pdf
  # Influential plots
  inf.meas = influence.measures(model)
  # print (summary(inf.meas)) # too much data
  # Leverage plot
  lev = hat(model.matrix(model))
  df=tibble::rownames_to_column(as.data.frame(lev),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=lev)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    ylab('Leverage - check') +
    xlab('Index')
  plot(p)
  # Cook's Distance (points above 15/n are labelled; reference line at 4/n)
  cd = cooks.distance(model)
  df=tibble::rownames_to_column(as.data.frame(cd),'id')
  p=ggplot(data=df,aes(x=as.numeric(id),y=cd)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_text(data=filter(df,cd>15/nrow(train)),aes(label=id),check_overlap=T,size=3,vjust=-.5)+
    ylab('Cooks distances') +
    geom_hline(yintercept = c(4/nrow(train),0),size=1)+
    xlab('Index')
  plot(p)
  print (paste("Number of data points that have Cook's D > 4/n: ", length(cd[cd > 4/nrow(train)]), sep = ""))
  print (paste("Number of data points that have Cook's D > 1: ", length(cd[cd > 1]), sep = ""))
  return(cd)
}
# function to set up random seeds for caret::trainControl
# Based on http://jaehyeon-kim.github.io/2015/05/Setup-Random-Seeds-on-Caret-Package.html
setCaretSeeds <- function(method = "cv", numbers = 1, repeats = 1, tunes = NULL, seed = 1701) {
  # Build a reproducible `seeds` list for caret::trainControl.
  #   method  : resampling method ("cv" or "repeatedcv"; anything else -> NULL)
  #   numbers : number of folds
  #   repeats : number of repeats (used for "repeatedcv" only)
  #   tunes   : tuning-grid length, if any (adds to each resample's seed count)
  #   seed    : master seed used to generate the per-resample seeds
  # Returns a list of length B+1: B integer vectors of length numbers+tunes,
  # plus one final single seed for the last model fit (caret convention),
  # or NULL for unsupported methods.
  # B is the number of resamples
  B <- if (method == "cv") numbers
       else if (method == "repeatedcv") numbers * repeats
       else NULL
  # FIX: the original tested is.null(length) -- `length` is the base function
  # and is never NULL, so the guard never fired and unsupported methods fell
  # into vector(length = NULL), which errors. Test B instead.
  if (is.null(B)) {
    seeds <- NULL
  } else {
    set.seed(seed = seed)
    seeds <- vector(mode = "list", length = B)
    seeds <- lapply(seeds, function(x) sample.int(n = 1000000
                    , size = numbers + ifelse(is.null(tunes), 0, tunes)))
    seeds[[length(seeds) + 1]] <- sample.int(n = 1000000, size = 1)
  }
  # return seeds
  seeds
}
train.caret.glmselect = function(formula, data, method
                        ,subopt = NULL, feature.names
                        , train.control = NULL, tune.grid = NULL, pre.proc = NULL){
  # Train a variable-selection regression through caret and report diagnostics
  # (CV metrics plot, residual plots, selected coefficients).
  #   formula       : full model formula (all candidate terms)
  #   data          : training data.frame
  #   method        : caret method ('leapForward'/'leapBackward'/'leapSeq',
  #                   'glmnet' with subopt='LASSO', or 'lars')
  #   subopt        : qualifier, only meaningful with method='glmnet'
  #   feature.names : candidate predictor names (sizes the leaps nvmax grid)
  #   train.control : optional caret::trainControl; defaults to seeded 10-fold CV
  #   tune.grid     : optional tuning grid; a method-specific default is built if NULL
  #   pre.proc      : optional preProcess spec (forced to center/scale for 'lars')
  # Returns a list: model, id (best subset size for leaps methods, else NULL),
  # residual plots and method-specific extras (modelLM / metricsPlot).
  if(is.null(train.control)){
    # Reproducible 10-fold CV: seeds fixed per resample via setCaretSeeds().
    train.control <- trainControl(method = "cv"
                                  ,number = 10
                                  ,seeds = setCaretSeeds(method = "cv"
                                                         , numbers = 10
                                                         , seed = 1701)
                                  ,search = "grid"
                                  ,verboseIter = TRUE
                                  ,allowParallel = TRUE
                                  )
  }
  if(is.null(tune.grid)){
    # Scalar comparisons: use ||/&& rather than the vectorized |/&.
    if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
      # Try every subset size up to the full feature count.
      tune.grid = data.frame(nvmax = seq_along(feature.names))
    }
    # FIX: guard with isTRUE() so a NULL subopt cannot make the condition
    # zero-length (which errors inside if()).
    if (method == 'glmnet' && isTRUE(subopt == 'LASSO')){
      # Will only show 1 Lambda value during training, but that is OK
      # https://stackoverflow.com/questions/47526544/why-need-to-tune-lambda-with-carettrain-method-glmnet-and-cv-glmnet
      # Another option for LASSO is this: https://github.com/topepo/caret/blob/master/RegressionTests/Code/lasso.R
      lambda = 10^seq(-2,0, length =100)
      alpha = c(1)  # alpha = 1 -> pure LASSO penalty
      tune.grid = expand.grid(alpha = alpha,lambda = lambda)
    }
    if (method == 'lars'){
      # https://github.com/topepo/caret/blob/master/RegressionTests/Code/lars.R
      fraction = seq(0, 1, length = 100)
      tune.grid = expand.grid(fraction = fraction)
      pre.proc = c("center", "scale")
    }
  }
  # http://sshaikh.org/2015/05/06/parallelize-machine-learning-in-r-with-multi-core-cpus/
  # FIX: makeCluster() expects an integer worker count; floor the 75% fraction
  # and keep at least one worker.
  cl <- makeCluster(max(1, floor(detectCores() * 0.75)))
  registerDoParallel(cl)
  set.seed(1)
  # note that the seed has to actually be set just before this function is called
  # setting it above does not ensure reproducibility for some reason
  model.caret <- caret::train(formula
                              , data = data
                              , method = method
                              , tuneGrid = tune.grid
                              , trControl = train.control
                              , preProc = pre.proc
                              )
  stopCluster(cl)
  registerDoSEQ() # register sequential engine in case you are not using this function anymore
  if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
    print("All models results")
    print(model.caret$results) # all model results
    print("Best Model")
    print(model.caret$bestTune) # best model
    model = model.caret$finalModel
    # Metrics plot: MAE/RMSE/R^2 across subset sizes.
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-nvmax) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=nvmax,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    # leap function does not support studentized residuals
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # FIX: the original dropped the '+' before theme_light(), leaving it as a
    # dead standalone statement; chain it onto the plot.
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    id = rownames(model.caret$bestTune)
    # Provides the coefficients of the best model
    # regsubsets doesn't return a full model (see regsubsets documentation),
    # so refit an lm on the selected terms to obtain coefficient intervals.
    # https://stackoverflow.com/questions/13063762/how-to-obtain-a-lm-object-from-regsubsets
    print("Coefficients of final model:")
    coefs <- coef(model, id=id)
    nams <- names(coefs)
    nams <- nams[!nams %in% "(Intercept)"]
    response <- as.character(formula[[2]])
    form <- as.formula(paste(response, paste(nams, collapse = " + "), sep = " ~ "))
    mod <- lm(form, data = data)
    #coefs
    #coef(mod)
    print(car::Confint(mod))
    return(list(model = model,id = id, residPlot = residPlot, residHistogram=residHistogram
                ,modelLM=mod))
  }
  if (method == 'glmnet' && isTRUE(subopt == 'LASSO')){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    print(model.caret$results)
    model=model.caret$finalModel
    # Metrics plot: MAE/RMSE/R^2 across lambda values.
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-lambda) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=lambda,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # FIX: chain theme_light() with '+' (was a dead standalone statement).
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    print("Coefficients")
    #no interval for glmnet: https://stackoverflow.com/questions/39750965/confidence-intervals-for-ridge-regression
    t=coef(model,s=model.caret$bestTune$lambda)
    model.coef = t[which(t[,1]!=0),]
    print(as.data.frame(model.coef))
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, metricsPlot=metricsPlot ))
  }
  if (method == 'lars'){
    print(model.caret)
    print(plot(model.caret))
    print(model.caret$bestTune)
    # Metrics plot: MAE/RMSE/R^2 across L1 fractions.
    dataPlot = model.caret$results %>%
      gather(key='metric',value='value',-fraction) %>%
      dplyr::filter(metric %in% c('MAE','RMSE','Rsquared'))
    metricsPlot = ggplot(data=dataPlot,aes(x=fraction,y=value) ) +
      geom_line(color='lightblue4') +
      geom_point(color='blue',alpha=0.7,size=.9) +
      facet_wrap(~metric,ncol=2,scales='free_y')+
      theme_light()
    plot(metricsPlot)
    # Residuals Plot
    dataPlot=data.frame(pred=predict(model.caret,data),res=resid(model.caret))
    residPlot = ggplot(dataPlot,aes(x=pred,y=res)) +
      geom_point(color='light blue',alpha=0.7) +
      geom_smooth(method="lm")+
      theme_light()
    plot(residPlot)
    # FIX: chain theme_light() with '+' (was a dead standalone statement).
    residHistogram = ggplot(dataPlot,aes(x=res)) +
      geom_histogram(aes(y=..density..),fill='light blue',alpha=1) +
      #geom_density(color='lightblue4') +
      stat_function(fun = dnorm, n = 100, args = list(mean = mean(dataPlot$res)
                    , sd = sd(dataPlot$res)),color='lightblue4') +
      theme_light()
    plot(residHistogram)
    print("Coefficients")
    t=coef(model.caret$finalModel,s=model.caret$bestTune$fraction,mode='fraction')
    model.coef = t[which(t!=0)]
    print(model.coef)
    id = NULL # not really needed but added for consistency
    return(list(model = model.caret,id = id, residPlot = residPlot, residHistogram=residHistogram))
  }
}
# https://stackoverflow.com/questions/48265743/linear-model-subset-selection-goodness-of-fit-with-k-fold-cross-validation
# changed slightly since call[[2]] was just returning "formula" without actually returning the value in formula
predict.regsubsets <- function(object, newdata, id, formula, ...) {
  # Predict from a regsubsets fit: build the design matrix for `formula` on
  # `newdata` (adds the intercept and expands any interaction terms), then
  # multiply the columns of the size-`id` model by its coefficients.
  coefi <- coef(object, id = id)
  design <- model.matrix(formula, newdata)
  design[, names(coefi)] %*% coefi
}
test.model = function(model, test, level=0.95
                      ,draw.limits = FALSE, good = 0.1, ok = 0.15
                      ,method = NULL, subopt = NULL
                      ,id = NULL, formula, feature.names, label.names
                      ,transformation = NULL){
  # Evaluate a fitted model on the hold-out set: prints MSE/RMSE and returns
  # an actual-vs-predicted plot with +/- `good` and +/- `ok` tolerance lines.
  #   model          : fitted model (lm-like, regsubsets, glmnet or lars/caret)
  #   test           : hold-out data.frame
  #   method/subopt  : which prediction path to use (NULL -> plain predict())
  #   id, formula    : best-subset size and full formula (leaps methods only)
  #   feature.names  : predictor columns (glmnet matrix interface)
  #   label.names    : response column name in `test`
  #   transformation : bestNormalize object for norm.pred back-transform
  # NOTE: log.pred and norm.pred are globals set at the top of the script.
  ## if using caret for glm select equivalent functionality,
  ## need to pass formula (full is ok as it will select subset of variables from there)
  # FIX: the original fell through to `method == ...` comparisons even when
  # method was NULL, which errors (zero-length condition); use an else-if chain.
  if (is.null(method)){
    pred = predict(model, newdata=test, interval="confidence", level = level)
  } else if (method == 'leapForward' || method == 'leapBackward' || method == 'leapSeq'){
    pred = predict.regsubsets(model, newdata = test, id = id, formula = formula)
  } else if (method == 'glmnet' && isTRUE(subopt == 'LASSO')){
    xtest = as.matrix(test[,feature.names])
    pred=as.data.frame(predict(model, xtest))
  } else if (method == 'lars'){
    pred=as.data.frame(predict(model, newdata = test))
  }
  # Summary of predicted values
  print ("Summary of predicted values: ")
  print(summary(pred[,1]))
  # Test-set error on the (possibly transformed) modeling scale.
  test.mse = mean((test[,label.names]-pred[,1])^2)
  print (paste(method, subopt, "Test MSE:", test.mse, sep=" "))
  test.rmse = sqrt(test.mse)
  print (paste(method, subopt, "Test RMSE:", test.rmse, sep=" "))
  if(log.pred == TRUE || norm.pred == TRUE){
    # plot transformed comparison first
    # FIX: inside a function an unassigned ggplot is not auto-printed, so this
    # plot silently never rendered; wrap it in print().
    df=data.frame(x=test[,label.names],y=pred[,1])
    print(ggplot(df,aes(x=x,y=y)) +
      geom_point(color='blue',alpha=0.5,shape=20,size=2) +
      geom_abline(slope=1,intercept=0,color='black',size=1) +
      #scale_y_continuous(limits=c(min(df),max(df)))+
      xlab("Actual (Transformed)")+
      ylab("Predicted (Transformed)"))
  }
  # Back-transform to the original response scale for the final comparison.
  if (log.pred == FALSE && norm.pred == FALSE){
    x = test[,label.names]
    y = pred[,1]
  }
  if (log.pred == TRUE){
    x = 10^test[,label.names]
    y = 10^pred[,1]
  }
  if (norm.pred == TRUE){
    x = predict(transformation, test[,label.names], inverse = TRUE)
    y = predict(transformation, pred[,1], inverse = TRUE)
  }
  df=data.frame(x,y)
  # Returned value (auto-printed by the caller): actual vs predicted with
  # +/- `good` (green) and +/- `ok` (red) tolerance bands around y = x.
  ggplot(df,aes(x,y)) +
    geom_point(color='blue',alpha=0.5,shape=20,size=2) +
    geom_abline(slope=c(1+good,1-good,1+ok,1-ok)
                ,intercept=rep(0,4),color=c('dark green','dark green','dark red','dark red'),size=1,alpha=0.8) +
    #scale_y_continuous(limits=c(min(df),max(df)))+
    xlab("Actual")+
    ylab("Predicted")
}
# Build the full model formula programmatically: every label column goes on
# the LHS, every remaining column of data.train becomes a predictor on the RHS.
n <- names(data.train)
formula <- as.formula(paste(paste(n[n %in% label.names], collapse = " + ")
," ~", paste(n[!n %in% label.names], collapse = " + ")))
# Intercept-only (grand mean) formula used later for the null/baseline model
grand.mean.formula = as.formula(paste(paste(n[n %in% label.names], collapse = " + ")," ~ 1"))
print(formula)
## y3.log ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 +
## PC10 + PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 +
## PC19 + PC20 + PC21 + PC22 + PC23 + PC24 + PC25 + PC26 + PC27 +
## PC28 + PC29 + PC30 + PC31 + PC32 + PC33 + PC34 + PC35 + PC36 +
## PC37 + PC38 + PC39 + PC40 + PC41 + PC42 + PC43 + PC44 + PC45 +
## PC46 + PC47 + PC48 + PC49 + PC50 + PC51 + PC52 + PC53 + PC54 +
## PC55 + PC56 + PC57 + PC58 + PC59 + PC60 + PC61 + PC62 + PC63 +
## PC64 + PC65 + PC66 + PC67 + PC68 + PC69 + PC70 + PC71 + PC72 +
## PC73 + PC74 + PC75 + PC76 + PC77 + PC78 + PC79 + PC80 + PC81 +
## PC82 + PC83 + PC84 + PC85 + PC86 + PC87 + PC88 + PC89 + PC90 +
## PC91 + PC92 + PC93 + PC94 + PC95 + PC96 + PC97 + PC98 + PC99 +
## PC100 + PC101 + PC102 + PC103 + PC104 + PC105 + PC106 + PC107 +
## PC108 + PC109 + PC110 + PC111 + PC112 + PC113 + PC114 + PC115 +
## PC116 + PC117 + PC118 + PC119 + PC120 + PC121 + PC122 + PC123 +
## PC124 + PC125 + PC126 + PC127 + PC128 + PC129 + PC130 + PC131 +
## PC132 + PC133 + PC134 + PC135 + PC136 + PC137 + PC138 + PC139 +
## PC140 + PC141 + PC142 + PC143 + PC144 + PC145 + PC146 + PC147 +
## PC148 + PC149 + PC150 + PC151 + PC152 + PC153 + PC154 + PC155 +
## PC156 + PC157 + PC158 + PC159 + PC160 + PC161 + PC162 + PC163 +
## PC164
print(grand.mean.formula)
## y3.log ~ 1
# Update feature.names because we may have transformed some features
# (recomputed here as every data.train column that is not a label column)
feature.names = n[!n %in% label.names]
# Baseline OLS fit on all predictors (the "full" model)
model.full = lm(formula , data.train)
summary(model.full)
##
## Call:
## lm(formula = formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.087358 -0.022151 -0.005539 0.016742 0.183529
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.097e+00 4.256e-04 4927.661 < 2e-16 ***
## PC1 -4.916e-04 3.711e-05 -13.247 < 2e-16 ***
## PC2 -9.048e-04 3.746e-05 -24.151 < 2e-16 ***
## PC3 -4.057e-04 3.771e-05 -10.758 < 2e-16 ***
## PC4 -3.306e-04 3.842e-05 -8.604 < 2e-16 ***
## PC5 2.160e-04 3.976e-05 5.432 5.82e-08 ***
## PC6 -1.190e-04 3.967e-05 -3.000 0.002711 **
## PC7 -2.044e-04 4.062e-05 -5.031 5.05e-07 ***
## PC8 -2.539e-05 4.100e-05 -0.619 0.535805
## PC9 -4.763e-05 4.224e-05 -1.128 0.259567
## PC10 -2.211e-07 4.303e-05 -0.005 0.995900
## PC11 -5.259e-04 4.618e-05 -11.389 < 2e-16 ***
## PC12 -4.830e-04 4.877e-05 -9.903 < 2e-16 ***
## PC13 3.304e-04 4.962e-05 6.658 3.05e-11 ***
## PC14 2.939e-04 5.104e-05 5.758 9.01e-09 ***
## PC15 6.677e-06 5.202e-05 0.128 0.897871
## PC16 3.601e-04 5.261e-05 6.846 8.46e-12 ***
## PC17 -1.972e-04 5.531e-05 -3.566 0.000365 ***
## PC18 -3.918e-04 5.803e-05 -6.752 1.61e-11 ***
## PC19 1.147e-04 5.869e-05 1.954 0.050781 .
## PC20 3.518e-04 6.391e-05 5.505 3.86e-08 ***
## PC21 1.265e-04 6.656e-05 1.901 0.057327 .
## PC22 1.391e-04 1.039e-04 1.339 0.180778
## PC23 2.531e-04 1.283e-04 1.973 0.048563 *
## PC24 -8.332e-04 1.496e-04 -5.568 2.70e-08 ***
## PC25 1.139e-04 1.678e-04 0.679 0.497177
## PC26 3.248e-04 1.740e-04 1.867 0.062024 .
## PC27 2.856e-04 1.744e-04 1.638 0.101514
## PC28 5.871e-05 1.771e-04 0.331 0.740287
## PC29 4.344e-04 1.932e-04 2.249 0.024567 *
## PC30 1.398e-04 1.977e-04 0.707 0.479365
## PC31 -2.336e-04 2.086e-04 -1.120 0.262810
## PC32 -9.122e-04 2.128e-04 -4.287 1.85e-05 ***
## PC33 3.803e-04 2.238e-04 1.699 0.089295 .
## PC34 1.115e-03 2.296e-04 4.855 1.24e-06 ***
## PC35 -2.726e-05 2.493e-04 -0.109 0.912946
## PC36 5.617e-05 2.519e-04 0.223 0.823524
## PC37 -3.050e-04 2.573e-04 -1.185 0.235946
## PC38 3.079e-04 2.689e-04 1.145 0.252336
## PC39 -7.455e-05 2.764e-04 -0.270 0.787395
## PC40 -4.995e-05 2.796e-04 -0.179 0.858208
## PC41 -2.331e-04 2.816e-04 -0.828 0.407917
## PC42 -4.659e-05 2.841e-04 -0.164 0.869743
## PC43 -1.844e-05 2.874e-04 -0.064 0.948834
## PC44 3.346e-04 2.899e-04 1.154 0.248386
## PC45 -9.567e-05 2.944e-04 -0.325 0.745180
## PC46 -1.490e-04 2.926e-04 -0.509 0.610644
## PC47 -5.590e-04 2.970e-04 -1.882 0.059903 .
## PC48 2.359e-04 2.981e-04 0.791 0.428800
## PC49 2.398e-04 2.984e-04 0.804 0.421626
## PC50 -2.881e-04 3.045e-04 -0.946 0.344062
## PC51 2.042e-04 3.033e-04 0.673 0.500729
## PC52 1.063e-04 3.046e-04 0.349 0.727111
## PC53 1.690e-04 3.066e-04 0.551 0.581472
## PC54 -1.006e-04 3.084e-04 -0.326 0.744162
## PC55 -1.053e-04 3.089e-04 -0.341 0.733246
## PC56 8.112e-05 3.090e-04 0.262 0.792946
## PC57 -5.237e-04 3.125e-04 -1.676 0.093839 .
## PC58 -1.228e-04 3.166e-04 -0.388 0.698070
## PC59 1.117e-03 3.193e-04 3.499 0.000471 ***
## PC60 -9.892e-05 3.157e-04 -0.313 0.754021
## PC61 -2.203e-05 3.200e-04 -0.069 0.945111
## PC62 -1.716e-04 3.184e-04 -0.539 0.589871
## PC63 -4.774e-04 3.219e-04 -1.483 0.138127
## PC64 -7.676e-04 3.220e-04 -2.384 0.017165 *
## PC65 -2.783e-04 3.250e-04 -0.856 0.391940
## PC66 -4.533e-04 3.248e-04 -1.396 0.162859
## PC67 1.959e-04 3.276e-04 0.598 0.549858
## PC68 9.463e-04 3.299e-04 2.869 0.004138 **
## PC69 1.193e-04 3.303e-04 0.361 0.717953
## PC70 -7.103e-05 3.368e-04 -0.211 0.833005
## PC71 9.273e-04 3.355e-04 2.764 0.005737 **
## PC72 -5.701e-06 3.368e-04 -0.017 0.986497
## PC73 4.690e-04 3.384e-04 1.386 0.165847
## PC74 -6.667e-04 3.385e-04 -1.969 0.048954 *
## PC75 -8.141e-04 3.370e-04 -2.416 0.015737 *
## PC76 9.722e-06 3.379e-04 0.029 0.977049
## PC77 3.307e-04 3.392e-04 0.975 0.329596
## PC78 3.177e-05 3.454e-04 0.092 0.926722
## PC79 4.561e-04 3.494e-04 1.305 0.191808
## PC80 -3.464e-04 3.500e-04 -0.990 0.322367
## PC81 9.733e-04 3.472e-04 2.803 0.005075 **
## PC82 4.893e-04 3.517e-04 1.391 0.164240
## PC83 -7.569e-04 3.536e-04 -2.141 0.032336 *
## PC84 5.380e-04 3.491e-04 1.541 0.123362
## PC85 1.094e-03 3.543e-04 3.089 0.002022 **
## PC86 -1.364e-04 3.565e-04 -0.383 0.702080
## PC87 1.594e-03 3.594e-04 4.436 9.36e-06 ***
## PC88 -5.426e-04 3.613e-04 -1.502 0.133170
## PC89 -2.240e-04 3.562e-04 -0.629 0.529464
## PC90 -5.949e-04 3.599e-04 -1.653 0.098409 .
## PC91 -1.537e-05 3.598e-04 -0.043 0.965923
## PC92 -2.643e-04 3.606e-04 -0.733 0.463564
## PC93 1.554e-04 3.643e-04 0.427 0.669686
## PC94 -7.188e-04 3.672e-04 -1.957 0.050350 .
## PC95 3.473e-04 3.661e-04 0.949 0.342867
## PC96 -6.288e-04 3.668e-04 -1.714 0.086559 .
## PC97 -3.383e-04 3.689e-04 -0.917 0.359252
## PC98 -1.257e-04 3.672e-04 -0.342 0.732051
## PC99 -4.575e-04 3.682e-04 -1.243 0.214040
## PC100 -8.294e-05 3.691e-04 -0.225 0.822225
## PC101 -1.142e-04 3.691e-04 -0.309 0.757081
## PC102 -8.382e-04 3.747e-04 -2.237 0.025338 *
## PC103 4.100e-04 3.673e-04 1.116 0.264363
## PC104 -7.438e-04 3.734e-04 -1.992 0.046407 *
## PC105 8.975e-04 3.707e-04 2.421 0.015506 *
## PC106 8.605e-04 3.727e-04 2.309 0.020994 *
## PC107 2.968e-04 3.722e-04 0.798 0.425136
## PC108 2.150e-04 3.722e-04 0.578 0.563551
## PC109 1.923e-04 3.731e-04 0.516 0.606210
## PC110 -1.934e-04 3.725e-04 -0.519 0.603726
## PC111 -7.475e-04 3.747e-04 -1.995 0.046087 *
## PC112 -8.785e-05 3.760e-04 -0.234 0.815297
## PC113 4.495e-04 3.756e-04 1.197 0.231459
## PC114 -2.992e-04 3.779e-04 -0.792 0.428548
## PC115 -1.811e-03 3.763e-04 -4.812 1.53e-06 ***
## PC116 -4.387e-05 3.801e-04 -0.115 0.908113
## PC117 -3.404e-04 3.760e-04 -0.905 0.365421
## PC118 2.054e-04 3.785e-04 0.543 0.587383
## PC119 -7.629e-04 3.826e-04 -1.994 0.046203 *
## PC120 2.688e-04 3.803e-04 0.707 0.479694
## PC121 -5.136e-04 3.818e-04 -1.345 0.178616
## PC122 5.781e-04 3.818e-04 1.514 0.130080
## PC123 -5.215e-04 3.820e-04 -1.365 0.172170
## PC124 4.218e-04 3.822e-04 1.103 0.269863
## PC125 1.790e-04 3.806e-04 0.470 0.638242
## PC126 9.147e-05 3.837e-04 0.238 0.811604
## PC127 2.697e-04 3.863e-04 0.698 0.485108
## PC128 -8.209e-04 3.866e-04 -2.123 0.033794 *
## PC129 -2.540e-05 3.838e-04 -0.066 0.947232
## PC130 3.811e-04 3.849e-04 0.990 0.322232
## PC131 -1.300e-03 3.845e-04 -3.381 0.000727 ***
## PC132 5.885e-04 3.858e-04 1.526 0.127187
## PC133 -1.527e-04 3.864e-04 -0.395 0.692764
## PC134 7.941e-04 3.876e-04 2.049 0.040523 *
## PC135 5.462e-04 3.914e-04 1.395 0.162941
## PC136 6.525e-04 3.916e-04 1.666 0.095765 .
## PC137 -5.444e-04 3.913e-04 -1.391 0.164142
## PC138 7.592e-04 3.872e-04 1.961 0.049974 *
## PC139 -1.134e-03 3.895e-04 -2.910 0.003629 **
## PC140 -4.688e-05 3.911e-04 -0.120 0.904604
## PC141 -5.084e-05 3.950e-04 -0.129 0.897604
## PC142 1.114e-04 3.938e-04 0.283 0.777264
## PC143 5.313e-04 3.916e-04 1.357 0.174905
## PC144 1.298e-03 3.959e-04 3.280 0.001046 **
## PC145 3.765e-04 3.924e-04 0.960 0.337309
## PC146 8.339e-04 3.965e-04 2.103 0.035507 *
## PC147 -3.049e-04 3.928e-04 -0.776 0.437628
## PC148 -2.049e-04 3.961e-04 -0.517 0.604878
## PC149 1.041e-04 3.982e-04 0.262 0.793671
## PC150 1.805e-04 3.931e-04 0.459 0.646193
## PC151 8.274e-04 4.006e-04 2.065 0.038923 *
## PC152 -4.157e-04 3.980e-04 -1.044 0.296301
## PC153 1.006e-03 3.977e-04 2.528 0.011486 *
## PC154 -1.162e-03 3.992e-04 -2.910 0.003625 **
## PC155 8.994e-04 3.944e-04 2.280 0.022628 *
## PC156 9.157e-04 3.994e-04 2.293 0.021891 *
## PC157 -1.209e-04 4.018e-04 -0.301 0.763535
## PC158 -1.622e-04 3.999e-04 -0.406 0.685090
## PC159 1.968e-03 4.014e-04 4.904 9.68e-07 ***
## PC160 -2.410e-04 3.979e-04 -0.606 0.544683
## PC161 7.655e-04 4.022e-04 1.903 0.057064 .
## PC162 -1.492e-03 3.996e-04 -3.734 0.000191 ***
## PC163 1.087e-03 4.009e-04 2.711 0.006736 **
## PC164 1.072e-04 4.013e-04 0.267 0.789461
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03169 on 5419 degrees of freedom
## Multiple R-squared: 0.26, Adjusted R-squared: 0.2376
## F-statistic: 11.61 on 164 and 5419 DF, p-value: < 2.2e-16
# Residual / influence diagnostics for the full model. plot.diagnostics is
# defined earlier in the file; it appears to return per-observation Cook's D
# values (used below) -- TODO confirm against its definition.
cd.full = plot.diagnostics(model=model.full, train=data.train)
## [1] "Number of data points that have Cook's D > 4/n: 277"
## [1] "Number of data points that have Cook's D > 1: 0"
# Row names of high-influence points, using the common 4/n Cook's D cutoff
high.cd = names(cd.full[cd.full > 4/nrow(data.train)])
#save dataset with high.cd flagged
# NOTE(review): `t` shadows base::t() from here on, and the same `t` is
# later passed as the `transformation` argument to test.model -- confirm
# that is intentional (it is only read when norm.pred is TRUE).
t = data.train %>%
rownames_to_column() %>%
mutate(high.cd = ifelse(rowname %in% high.cd,1,0))
#write.csv(t,file='data_high_cd_flag.csv',row.names = F)
###
# Refit the full model with the high-Cook's-D rows removed
data.train2 = data.train[!(rownames(data.train)) %in% high.cd,]
model.full2 = lm(formula , data.train2)
summary(model.full2)
##
## Call:
## lm(formula = formula, data = data.train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.057557 -0.018900 -0.003749 0.016627 0.080149
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.093e+00 3.546e-04 5903.302 < 2e-16 ***
## PC1 -5.299e-04 3.157e-05 -16.786 < 2e-16 ***
## PC2 -8.916e-04 3.135e-05 -28.441 < 2e-16 ***
## PC3 -4.107e-04 3.174e-05 -12.941 < 2e-16 ***
## PC4 -3.598e-04 3.209e-05 -11.212 < 2e-16 ***
## PC5 2.186e-04 3.338e-05 6.548 6.39e-11 ***
## PC6 -9.776e-05 3.317e-05 -2.947 0.003219 **
## PC7 -2.238e-04 3.390e-05 -6.601 4.50e-11 ***
## PC8 -2.868e-05 3.440e-05 -0.834 0.404534
## PC9 9.144e-06 3.531e-05 0.259 0.795683
## PC10 2.363e-05 3.597e-05 0.657 0.511238
## PC11 -5.768e-04 3.858e-05 -14.949 < 2e-16 ***
## PC12 -4.922e-04 4.057e-05 -12.133 < 2e-16 ***
## PC13 3.234e-04 4.137e-05 7.818 6.49e-15 ***
## PC14 2.707e-04 4.261e-05 6.353 2.30e-10 ***
## PC15 -2.028e-05 4.340e-05 -0.467 0.640296
## PC16 3.380e-04 4.381e-05 7.715 1.44e-14 ***
## PC17 -2.067e-04 4.607e-05 -4.486 7.43e-06 ***
## PC18 -3.898e-04 4.829e-05 -8.072 8.53e-16 ***
## PC19 1.123e-04 4.903e-05 2.289 0.022092 *
## PC20 4.121e-04 5.331e-05 7.731 1.28e-14 ***
## PC21 1.153e-04 5.550e-05 2.078 0.037787 *
## PC22 1.873e-04 8.655e-05 2.164 0.030526 *
## PC23 2.807e-04 1.088e-04 2.580 0.009910 **
## PC24 -9.379e-04 1.253e-04 -7.486 8.31e-14 ***
## PC25 1.920e-04 1.410e-04 1.361 0.173550
## PC26 2.321e-04 1.454e-04 1.596 0.110446
## PC27 1.371e-04 1.468e-04 0.934 0.350139
## PC28 -5.134e-05 1.485e-04 -0.346 0.729591
## PC29 4.007e-04 1.612e-04 2.486 0.012945 *
## PC30 1.458e-04 1.662e-04 0.877 0.380546
## PC31 -3.007e-04 1.750e-04 -1.718 0.085812 .
## PC32 -8.379e-04 1.782e-04 -4.701 2.66e-06 ***
## PC33 2.806e-05 1.893e-04 0.148 0.882123
## PC34 1.132e-03 1.920e-04 5.896 3.97e-09 ***
## PC35 1.318e-04 2.107e-04 0.626 0.531659
## PC36 -1.837e-05 2.112e-04 -0.087 0.930702
## PC37 -4.381e-04 2.149e-04 -2.039 0.041497 *
## PC38 4.856e-04 2.244e-04 2.163 0.030559 *
## PC39 4.571e-05 2.421e-04 0.189 0.850220
## PC40 -1.206e-04 2.362e-04 -0.510 0.609728
## PC41 -4.206e-04 2.380e-04 -1.767 0.077230 .
## PC42 2.545e-04 2.402e-04 1.060 0.289373
## PC43 3.162e-04 2.429e-04 1.302 0.193039
## PC44 2.599e-04 2.496e-04 1.041 0.297869
## PC45 6.284e-05 2.483e-04 0.253 0.800179
## PC46 -5.995e-05 2.468e-04 -0.243 0.808073
## PC47 -6.898e-04 2.514e-04 -2.744 0.006097 **
## PC48 3.091e-04 2.513e-04 1.230 0.218647
## PC49 4.311e-04 2.520e-04 1.711 0.087195 .
## PC50 -5.019e-04 2.580e-04 -1.945 0.051788 .
## PC51 2.571e-04 2.578e-04 0.997 0.318759
## PC52 5.198e-05 2.564e-04 0.203 0.839389
## PC53 3.458e-04 2.582e-04 1.339 0.180529
## PC54 -5.624e-05 2.632e-04 -0.214 0.830817
## PC55 -4.819e-04 2.619e-04 -1.840 0.065791 .
## PC56 -4.931e-05 2.620e-04 -0.188 0.850756
## PC57 -6.611e-04 2.637e-04 -2.507 0.012219 *
## PC58 -3.430e-04 2.679e-04 -1.280 0.200462
## PC59 9.591e-04 2.704e-04 3.547 0.000393 ***
## PC60 -3.708e-04 2.671e-04 -1.389 0.165027
## PC61 -1.753e-04 2.692e-04 -0.651 0.514823
## PC62 -6.054e-06 2.706e-04 -0.022 0.982153
## PC63 -2.261e-04 2.724e-04 -0.830 0.406592
## PC64 -6.724e-04 2.715e-04 -2.477 0.013291 *
## PC65 -2.336e-04 2.761e-04 -0.846 0.397406
## PC66 -2.884e-04 2.747e-04 -1.050 0.293794
## PC67 3.082e-04 2.765e-04 1.115 0.265046
## PC68 8.008e-04 2.785e-04 2.876 0.004047 **
## PC69 2.446e-04 2.791e-04 0.876 0.380817
## PC70 2.538e-04 2.834e-04 0.896 0.370508
## PC71 5.409e-04 2.822e-04 1.917 0.055340 .
## PC72 -1.700e-04 2.834e-04 -0.600 0.548573
## PC73 5.263e-04 2.849e-04 1.847 0.064821 .
## PC74 -3.433e-04 2.850e-04 -1.205 0.228399
## PC75 -5.028e-04 2.835e-04 -1.773 0.076244 .
## PC76 -1.954e-04 2.829e-04 -0.691 0.489832
## PC77 3.812e-04 2.849e-04 1.338 0.180895
## PC78 -1.552e-04 2.904e-04 -0.534 0.593186
## PC79 7.164e-04 2.931e-04 2.445 0.014534 *
## PC80 1.071e-05 2.932e-04 0.037 0.970865
## PC81 1.086e-03 2.915e-04 3.725 0.000198 ***
## PC82 2.654e-04 2.944e-04 0.901 0.367433
## PC83 -6.646e-04 2.976e-04 -2.233 0.025605 *
## PC84 6.576e-04 2.933e-04 2.242 0.024986 *
## PC85 1.381e-03 2.983e-04 4.628 3.79e-06 ***
## PC86 9.934e-05 2.979e-04 0.333 0.738813
## PC87 1.402e-03 3.001e-04 4.671 3.07e-06 ***
## PC88 -6.287e-04 3.027e-04 -2.077 0.037873 *
## PC89 -2.408e-04 2.990e-04 -0.805 0.420640
## PC90 -3.430e-04 3.011e-04 -1.139 0.254633
## PC91 -1.663e-04 3.001e-04 -0.554 0.579506
## PC92 5.796e-05 3.013e-04 0.192 0.847455
## PC93 -1.445e-04 3.053e-04 -0.473 0.636113
## PC94 -6.583e-04 3.069e-04 -2.145 0.032010 *
## PC95 3.489e-04 3.066e-04 1.138 0.255093
## PC96 -4.709e-04 3.064e-04 -1.537 0.124353
## PC97 -3.803e-04 3.082e-04 -1.234 0.217255
## PC98 -2.698e-04 3.070e-04 -0.879 0.379431
## PC99 -2.354e-04 3.070e-04 -0.767 0.443229
## PC100 -1.843e-04 3.071e-04 -0.600 0.548428
## PC101 -3.551e-04 3.088e-04 -1.150 0.250231
## PC102 -5.902e-04 3.125e-04 -1.889 0.058943 .
## PC103 3.527e-04 3.076e-04 1.146 0.251687
## PC104 -8.410e-04 3.101e-04 -2.711 0.006721 **
## PC105 1.002e-03 3.111e-04 3.222 0.001281 **
## PC106 7.113e-04 3.120e-04 2.280 0.022641 *
## PC107 2.759e-04 3.120e-04 0.884 0.376495
## PC108 4.919e-05 3.102e-04 0.159 0.874012
## PC109 -1.698e-05 3.117e-04 -0.054 0.956552
## PC110 -1.726e-04 3.100e-04 -0.557 0.577822
## PC111 -9.062e-04 3.137e-04 -2.889 0.003881 **
## PC112 3.727e-05 3.144e-04 0.119 0.905641
## PC113 2.957e-04 3.133e-04 0.944 0.345278
## PC114 -4.010e-04 3.160e-04 -1.269 0.204510
## PC115 -1.869e-03 3.150e-04 -5.933 3.17e-09 ***
## PC116 4.756e-05 3.169e-04 0.150 0.880696
## PC117 -4.535e-05 3.139e-04 -0.145 0.885111
## PC118 -1.428e-04 3.162e-04 -0.451 0.651661
## PC119 -7.253e-04 3.200e-04 -2.267 0.023460 *
## PC120 2.154e-04 3.169e-04 0.680 0.496800
## PC121 -4.914e-04 3.180e-04 -1.546 0.122282
## PC122 4.852e-04 3.178e-04 1.527 0.126863
## PC123 -5.137e-04 3.186e-04 -1.612 0.106989
## PC124 1.422e-04 3.188e-04 0.446 0.655696
## PC125 4.159e-04 3.179e-04 1.308 0.190792
## PC126 2.189e-04 3.195e-04 0.685 0.493151
## PC127 -8.043e-05 3.231e-04 -0.249 0.803455
## PC128 -7.825e-04 3.221e-04 -2.429 0.015166 *
## PC129 -2.447e-05 3.217e-04 -0.076 0.939380
## PC130 2.570e-04 3.205e-04 0.802 0.422642
## PC131 -9.006e-04 3.211e-04 -2.804 0.005061 **
## PC132 2.621e-04 3.217e-04 0.815 0.415327
## PC133 -7.463e-06 3.247e-04 -0.023 0.981665
## PC134 6.092e-04 3.237e-04 1.882 0.059908 .
## PC135 2.463e-04 3.258e-04 0.756 0.449711
## PC136 7.741e-04 3.274e-04 2.365 0.018089 *
## PC137 -6.146e-04 3.254e-04 -1.889 0.058980 .
## PC138 6.358e-04 3.239e-04 1.963 0.049754 *
## PC139 -8.848e-04 3.260e-04 -2.714 0.006671 **
## PC140 -3.999e-04 3.261e-04 -1.226 0.220094
## PC141 2.863e-04 3.285e-04 0.872 0.383460
## PC142 4.217e-04 3.282e-04 1.285 0.198948
## PC143 5.793e-04 3.263e-04 1.775 0.075909 .
## PC144 1.071e-03 3.297e-04 3.248 0.001169 **
## PC145 5.363e-04 3.273e-04 1.639 0.101311
## PC146 1.175e-03 3.296e-04 3.567 0.000365 ***
## PC147 -2.749e-04 3.286e-04 -0.837 0.402873
## PC148 -1.805e-04 3.297e-04 -0.547 0.584114
## PC149 -8.157e-05 3.321e-04 -0.246 0.806023
## PC150 4.283e-04 3.284e-04 1.304 0.192196
## PC151 9.375e-04 3.342e-04 2.805 0.005046 **
## PC152 -3.795e-04 3.324e-04 -1.142 0.253600
## PC153 7.395e-04 3.304e-04 2.238 0.025252 *
## PC154 -7.646e-04 3.344e-04 -2.287 0.022261 *
## PC155 6.418e-04 3.289e-04 1.951 0.051066 .
## PC156 6.895e-04 3.335e-04 2.067 0.038753 *
## PC157 4.481e-05 3.356e-04 0.134 0.893791
## PC158 -7.994e-05 3.352e-04 -0.238 0.811529
## PC159 1.549e-03 3.336e-04 4.643 3.51e-06 ***
## PC160 -1.971e-04 3.339e-04 -0.590 0.554903
## PC161 3.329e-04 3.358e-04 0.992 0.321455
## PC162 -1.688e-03 3.330e-04 -5.070 4.12e-07 ***
## PC163 9.402e-04 3.364e-04 2.795 0.005209 **
## PC164 1.972e-05 3.343e-04 0.059 0.952969
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0257 on 5142 degrees of freedom
## Multiple R-squared: 0.349, Adjusted R-squared: 0.3282
## F-statistic: 16.81 on 164 and 5142 DF, p-value: < 2.2e-16
# Re-run diagnostics on the refit (high-influence rows removed)
cd.full2 = plot.diagnostics(model.full2, data.train2)
## [1] "Number of data points that have Cook's D > 4/n: 243"
## [1] "Number of data points that have Cook's D > 1: 0"
# much more normal residuals than before.
# Checking to see if distributions are different and if so which variables
# High Leverage Plot: target distribution of flagged vs normal rows
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,target=one_of(label.names))
ggplot(data=plotData, aes(x=type,y=target)) +
geom_boxplot(fill='light blue',outlier.shape=NA) +
scale_y_continuous(name="Target Variable Values",label=scales::comma_format(accuracy=.1)) +
theme_light() +
ggtitle('Distribution of High Leverage Points and Normal Points')
# 2 sample t-tests: one equal-variance t-test per feature, High vs Normal
plotData = data.train %>%
rownames_to_column() %>%
mutate(type=ifelse(rowname %in% high.cd,'High','Normal')) %>%
dplyr::select(type,one_of(feature.names))
comp.test = lapply(dplyr::select(plotData, one_of(feature.names))
, function(x) t.test(x ~ plotData$type, var.equal = TRUE))
# Keep only features whose group means differ at the 5% level
# (no multiple-testing correction is applied here)
sig.comp = list.filter(comp.test, p.value < 0.05)
sapply(sig.comp, function(x) x[['p.value']])
## PC1 PC11 PC20 PC23 PC24 PC25 PC26 PC27 PC28
## 2.149148e-06 1.129437e-03 1.169599e-02 8.428939e-06 1.032749e-02 2.608645e-04 6.914483e-03 1.668421e-02 3.112747e-02
## PC33 PC44 PC45 PC59 PC75 PC131 PC161
## 7.779543e-03 6.062678e-03 2.070030e-03 4.034192e-02 1.177114e-02 3.468476e-02 6.449866e-03
# Boxplots restricted to the significantly-different features
mm = melt(plotData, id=c('type')) %>% filter(variable %in% names(sig.comp))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=5, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Distribution (box) Plots for ALL features, High vs Normal
mm = melt(plotData, id=c('type'))
ggplot(mm,aes(x=type, y=value)) +
geom_boxplot()+
facet_wrap(~variable, ncol=8, scales = 'free_y') +
scale_y_continuous(name="values",label=scales::comma_format(accuracy=.1)) +
ggtitle('Distribution of High Leverage Points and Normal Points')
# Null (intercept-only) model as the comparison baseline
model.null = lm(grand.mean.formula, data.train)
summary(model.null)
##
## Call:
## lm(formula = grand.mean.formula, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.115198 -0.023854 -0.003319 0.020554 0.190115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0970741 0.0004857 4318 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03629 on 5583 degrees of freedom
References — Basic: http://www.stat.columbia.edu/~martin/W2024/R10.pdf ; Cross Validation + Other Metrics: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/
# Forward stepwise selection (leapForward) via caret, gated by params.
# set.seed(1) before each run keeps the CV folds reproducible.
if (algo.forward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
, data = data.train
, method = "leapForward"
, feature.names = feature.names)
model.forward = returned$model
id = returned$id
}
# Evaluate forward-selection model on the hold-out set.
# NOTE(review): `t` passed as `transformation` is the Cook's-D-flagged
# data.frame built earlier, not a transformation object; it is only read
# when norm.pred is TRUE (FALSE in this run) -- confirm intent.
if (algo.forward.caret == TRUE){
test.model(model=model.forward, test=data.test
,method = 'leapForward',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# Backward stepwise selection (leapBackward) via caret, gated by params
if (algo.backward.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "leapBackward"
,feature.names = feature.names)
model.backward = returned$model
id = returned$id
}
# Evaluate backward-selection model on the hold-out set
if (algo.backward.caret == TRUE){
test.model(model.backward, data.test
,method = 'leapBackward',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# Stepwise (both directions, leapSeq) selection via caret, gated by params
if (algo.stepwise.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "leapSeq"
,feature.names = feature.names)
model.stepwise = returned$model
id = returned$id
}
# Evaluate stepwise-selection model on the hold-out set
if (algo.stepwise.caret == TRUE){
test.model(model.stepwise, data.test
,method = 'leapSeq',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,id = id
,draw.limits = TRUE, transformation = t)
}
# LASSO via glmnet (alpha = 1), tuning lambda over a log-spaced grid
if (algo.LASSO.caret == TRUE){
set.seed(1)
tune.grid= expand.grid(alpha = 1,lambda = 10^seq(from=-4,to=-2,length=100))
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "glmnet"
,subopt = 'LASSO'
,tune.grid = tune.grid
,feature.names = feature.names)
model.LASSO.caret = returned$model
}
# Evaluate LASSO model on the hold-out set
if (algo.LASSO.caret == TRUE){
test.model(model.LASSO.caret, data.test
,method = 'glmnet',subopt = "LASSO"
,formula = formula, feature.names = feature.names, label.names = label.names
,draw.limits = TRUE, transformation = t)
}
# LARS fit via caret, gated by params.
# NOTE(review): subopt is passed as the STRING 'NULL' here, unlike every
# other call in this file which passes the value NULL -- likely a typo;
# confirm how train.caret.glmselect (defined earlier) handles a NULL
# subopt before changing it.
if (algo.LARS.caret == TRUE){
set.seed(1)
returned = train.caret.glmselect(formula = formula
,data = data.train
,method = "lars"
,subopt = 'NULL'
,feature.names = feature.names)
model.LARS.caret = returned$model
}
# Evaluate LARS model on the hold-out set (see NOTE on `t` above: it is the
# Cook's-D-flagged data.frame, only read when norm.pred is TRUE)
if (algo.LARS.caret == TRUE){
test.model(model.LARS.caret, data.test
,method = 'lars',subopt = NULL
,formula = formula, feature.names = feature.names, label.names = label.names
,draw.limits = TRUE, transformation = t)
}
# Record the package/version environment for reproducibility
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 knitr_1.20 htmltools_0.3.6 reshape2_1.4.3
## [5] lars_1.2 doParallel_1.0.14 iterators_1.0.10 caret_6.0-81
## [9] leaps_3.0 ggforce_0.1.3 rlist_0.4.6.1 car_3.0-2
## [13] carData_3.0-2 bestNormalize_1.3.0 scales_1.0.0 onewaytests_2.0
## [17] caTools_1.17.1.1 mosaic_1.5.0 mosaicData_0.17.0 ggformula_0.9.1
## [21] ggstance_0.3.1 lattice_0.20-35 DT_0.5 ggiraph_0.6.0
## [25] investr_1.4.0 glmnet_2.0-16 foreach_1.4.4 Matrix_1.2-14
## [29] MASS_7.3-50 PerformanceAnalytics_1.5.2 xts_0.11-2 zoo_1.8-4
## [33] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8 purrr_0.2.5
## [37] readr_1.3.1 tidyr_0.8.2 tibble_1.4.2 ggplot2_3.1.0
## [41] tidyverse_1.2.1 usdm_1.1-18 raster_2.8-4 sp_1.3-1
## [45] pacman_0.5.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.2.0 backports_1.1.3 plyr_1.8.4 lazyeval_0.2.1 splines_3.5.1 mycor_0.1.1
## [7] crosstalk_1.0.0 leaflet_2.0.2 digest_0.6.18 magrittr_1.5 mosaicCore_0.6.0 openxlsx_4.1.0
## [13] recipes_0.1.4 modelr_0.1.2 gower_0.1.2 colorspace_1.3-2 rvest_0.3.2 ggrepel_0.8.0
## [19] haven_2.0.0 crayon_1.3.4 jsonlite_1.5 bindr_0.1.1 survival_2.42-3 glue_1.3.0
## [25] registry_0.5 gtable_0.2.0 ppcor_1.1 ipred_0.9-8 abind_1.4-5 rngtools_1.3.1
## [31] bibtex_0.4.2 Rcpp_1.0.0 xtable_1.8-3 units_0.6-2 foreign_0.8-70 stats4_3.5.1
## [37] lava_1.6.4 prodlim_2018.04.18 htmlwidgets_1.3 httr_1.4.0 RColorBrewer_1.1-2 pkgconfig_2.0.2
## [43] farver_1.1.0 nnet_7.3-12 labeling_0.3 tidyselect_0.2.5 rlang_0.3.1 later_0.7.5
## [49] munsell_0.5.0 cellranger_1.1.0 tools_3.5.1 cli_1.0.1 generics_0.0.2 moments_0.14
## [55] sjlabelled_1.0.17 broom_0.5.1 evaluate_0.12 ggdendro_0.1-20 yaml_2.2.0 ModelMetrics_1.2.2
## [61] zip_2.0.1 nlme_3.1-137 doRNG_1.7.1 mime_0.6 xml2_1.2.0 compiler_3.5.1
## [67] rstudioapi_0.8 curl_3.2 tweenr_1.0.1 stringi_1.2.4 highr_0.7 gdtools_0.1.7
## [73] pillar_1.3.1 data.table_1.11.8 bitops_1.0-6 insight_0.1.2 httpuv_1.4.5 R6_2.3.0
## [79] promises_1.0.1 gridExtra_2.3 rio_0.5.16 codetools_0.2-15 assertthat_0.2.0 pkgmaker_0.27
## [85] withr_2.1.2 nortest_1.0-4 mgcv_1.8-24 hms_0.4.2 quadprog_1.5-5 grid_3.5.1
## [91] rpart_4.1-13 timeDate_3043.102 class_7.3-14 rmarkdown_1.11 shiny_1.2.0 lubridate_1.7.4